In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from fbprophet import Prophet

%matplotlib inline
In [2]:
# Load the avocado price dataset and preview the first 20 rows.
# Provenance note (review): this looks like the Kaggle "Avocado Prices" CSV —
# weekly average prices and volumes per US region, 2015-2018. Confirm source.
df = pd.read_csv('avocado.csv')
df.head(20)
Out[2]:
Unnamed: 0 Date AveragePrice Total Volume 4046 4225 4770 Total Bags Small Bags Large Bags XLarge Bags type year region
0 0 2015-12-27 1.33 64236.62 1036.74 54454.85 48.16 8696.87 8603.62 93.25 0.0 conventional 2015 Albany
1 1 2015-12-20 1.35 54876.98 674.28 44638.81 58.33 9505.56 9408.07 97.49 0.0 conventional 2015 Albany
2 2 2015-12-13 0.93 118220.22 794.70 109149.67 130.50 8145.35 8042.21 103.14 0.0 conventional 2015 Albany
3 3 2015-12-06 1.08 78992.15 1132.00 71976.41 72.58 5811.16 5677.40 133.76 0.0 conventional 2015 Albany
4 4 2015-11-29 1.28 51039.60 941.48 43838.39 75.78 6183.95 5986.26 197.69 0.0 conventional 2015 Albany
5 5 2015-11-22 1.26 55979.78 1184.27 48067.99 43.61 6683.91 6556.47 127.44 0.0 conventional 2015 Albany
6 6 2015-11-15 0.99 83453.76 1368.92 73672.72 93.26 8318.86 8196.81 122.05 0.0 conventional 2015 Albany
7 7 2015-11-08 0.98 109428.33 703.75 101815.36 80.00 6829.22 6266.85 562.37 0.0 conventional 2015 Albany
8 8 2015-11-01 1.02 99811.42 1022.15 87315.57 85.34 11388.36 11104.53 283.83 0.0 conventional 2015 Albany
9 9 2015-10-25 1.07 74338.76 842.40 64757.44 113.00 8625.92 8061.47 564.45 0.0 conventional 2015 Albany
10 10 2015-10-18 1.12 84843.44 924.86 75595.85 117.07 8205.66 7877.86 327.80 0.0 conventional 2015 Albany
11 11 2015-10-11 1.28 64489.17 1582.03 52677.92 105.32 10123.90 9866.27 257.63 0.0 conventional 2015 Albany
12 12 2015-10-04 1.31 61007.10 2268.32 49880.67 101.36 8756.75 8379.98 376.77 0.0 conventional 2015 Albany
13 13 2015-09-27 0.99 106803.39 1204.88 99409.21 154.84 6034.46 5888.87 145.59 0.0 conventional 2015 Albany
14 14 2015-09-20 1.33 69759.01 1028.03 59313.12 150.50 9267.36 8489.10 778.26 0.0 conventional 2015 Albany
15 15 2015-09-13 1.28 76111.27 985.73 65696.86 142.00 9286.68 8665.19 621.49 0.0 conventional 2015 Albany
16 16 2015-09-06 1.11 99172.96 879.45 90062.62 240.79 7990.10 7762.87 227.23 0.0 conventional 2015 Albany
17 17 2015-08-30 1.07 105693.84 689.01 94362.67 335.43 10306.73 10218.93 87.80 0.0 conventional 2015 Albany
18 18 2015-08-23 1.34 79992.09 733.16 67933.79 444.78 10880.36 10745.79 134.57 0.0 conventional 2015 Albany
19 19 2015-08-16 1.33 80043.78 539.65 68666.01 394.90 10443.22 10297.68 145.54 0.0 conventional 2015 Albany
In [3]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18249 entries, 0 to 18248
Data columns (total 14 columns):
Unnamed: 0      18249 non-null int64
Date            18249 non-null object
AveragePrice    18249 non-null float64
Total Volume    18249 non-null float64
4046            18249 non-null float64
4225            18249 non-null float64
4770            18249 non-null float64
Total Bags      18249 non-null float64
Small Bags      18249 non-null float64
Large Bags      18249 non-null float64
XLarge Bags     18249 non-null float64
type            18249 non-null object
year            18249 non-null int64
region          18249 non-null object
dtypes: float64(9), int64(2), object(3)
memory usage: 1.9+ MB
In [4]:
# Rename the PLU-code columns (4046 / 4225 / 4770) to human-readable Hass size
# names. The original call used `index=str` (a leftover pandas-docs artifact
# that also silently casts the row index to strings) and `inplace=True`;
# explicit reassignment is the idiomatic, chainable form and keeps the
# RangeIndex intact.
df = df.rename(columns={"4046": "Small Hass", "4225": "Large Hass", "4770": "Extra-Large Hass"})
In [5]:
# Drop the redundant 'Unnamed: 0' column (the CSV's saved row index).
# `columns=` is clearer than `axis=1`, and reassignment avoids the
# inplace=True anti-pattern (no performance benefit, hampers re-runs).
df = df.drop(columns=['Unnamed: 0'])
In [6]:
# Parse the Date column (read in as strings) into datetime64 values so it can
# drive time-series plots and serve as Prophet's `ds` column later.
df['Date'] = pd.to_datetime(df['Date'])
In [7]:
df.head()
Out[7]:
Date AveragePrice Total Volume Small Hass Large Hass Extra-Large Hass Total Bags Small Bags Large Bags XLarge Bags type year region
0 2015-12-27 1.33 64236.62 1036.74 54454.85 48.16 8696.87 8603.62 93.25 0.0 conventional 2015 Albany
1 2015-12-20 1.35 54876.98 674.28 44638.81 58.33 9505.56 9408.07 97.49 0.0 conventional 2015 Albany
2 2015-12-13 0.93 118220.22 794.70 109149.67 130.50 8145.35 8042.21 103.14 0.0 conventional 2015 Albany
3 2015-12-06 1.08 78992.15 1132.00 71976.41 72.58 5811.16 5677.40 133.76 0.0 conventional 2015 Albany
4 2015-11-29 1.28 51039.60 941.48 43838.39 75.78 6183.95 5986.26 197.69 0.0 conventional 2015 Albany

Analyzing the categorical columns to understand the full scope of this dataset

In [8]:
df['region'].value_counts()
Out[8]:
MiamiFtLauderdale      338
LosAngeles             338
Tampa                  338
West                   338
HartfordSpringfield    338
Portland               338
CincinnatiDayton       338
SouthCarolina          338
Boise                  338
Midsouth               338
Albany                 338
Detroit                338
Pittsburgh             338
Sacramento             338
California             338
Atlanta                338
Columbus               338
Seattle                338
NorthernNewEngland     338
LasVegas               338
SouthCentral           338
Indianapolis           338
NewYork                338
Chicago                338
StLouis                338
Houston                338
Northeast              338
RaleighGreensboro      338
Orlando                338
Philadelphia           338
NewOrleansMobile       338
Plains                 338
Charlotte              338
HarrisburgScranton     338
PhoenixTucson          338
Boston                 338
Southeast              338
DallasFtWorth          338
GreatLakes             338
Syracuse               338
Denver                 338
SanFrancisco           338
Roanoke                338
TotalUS                338
Nashville              338
GrandRapids            338
Louisville             338
SanDiego               338
Spokane                338
RichmondNorfolk        338
Jacksonville           338
BaltimoreWashington    338
BuffaloRochester       338
WestTexNewMexico       335
Name: region, dtype: int64
In [9]:
df['region'].nunique()
Out[9]:
54
In [10]:
df['year'].value_counts()
Out[10]:
2017    5722
2016    5616
2015    5615
2018    1296
Name: year, dtype: int64
In [11]:
df['type'].value_counts()
Out[11]:
conventional    9126
organic         9123
Name: type, dtype: int64

Here we see that there are two different types of avocados: conventional and organic. Let's separate these two types from each other.

In [12]:
# Partition the dataset into one frame per avocado type.
is_conventional = df['type'] == 'conventional'
is_organic = df['type'] == 'organic'
conventional_frame = df.loc[is_conventional]
organic_frame = df.loc[is_organic]
In [13]:
organic_frame.info()
<class 'pandas.core.frame.DataFrame'>
Index: 9123 entries, 9126 to 18248
Data columns (total 13 columns):
Date                9123 non-null datetime64[ns]
AveragePrice        9123 non-null float64
Total Volume        9123 non-null float64
Small Hass          9123 non-null float64
Large Hass          9123 non-null float64
Extra-Large Hass    9123 non-null float64
Total Bags          9123 non-null float64
Small Bags          9123 non-null float64
Large Bags          9123 non-null float64
XLarge Bags         9123 non-null float64
type                9123 non-null object
year                9123 non-null int64
region              9123 non-null object
dtypes: datetime64[ns](1), float64(9), int64(1), object(2)
memory usage: 997.8+ KB
In [14]:
organic_frame['AveragePrice'].mean()
Out[14]:
1.6539986846432082
In [15]:
conventional_frame['AveragePrice'].mean()
Out[15]:
1.1580396668858206
In [16]:
organic_frame[['AveragePrice','Small Hass']]
Out[16]:
AveragePrice Small Hass
9126 1.83 8.16
9127 1.89 30.24
9128 1.85 10.44
9129 1.84 90.29
9130 1.94 0.00
9131 1.94 13.84
9132 1.89 20.71
9133 1.88 20.08
9134 1.88 11.47
9135 1.83 49.27
9136 1.97 10.31
9137 1.90 28.65
9138 1.98 5.74
9139 1.98 13.79
9140 1.98 42.63
9141 1.99 13.86
9142 1.86 30.13
9143 1.88 17.27
9144 1.87 24.45
9145 2.00 24.56
9146 1.88 79.82
9147 2.00 17.66
9148 2.01 99.16
9149 2.08 50.86
9150 2.01 34.27
9151 2.04 50.69
9152 2.02 22.35
9153 2.09 17.59
9154 2.03 79.45
9155 1.93 25.60
... ... ...
18219 1.56 98465.26
18220 1.53 117922.52
18221 1.61 118616.17
18222 1.63 108705.28
18223 1.59 145680.62
18224 1.51 129541.43
18225 1.60 26996.28
18226 1.73 33437.98
18227 1.63 27566.25
18228 1.46 25990.60
18229 1.49 34200.18
18230 1.64 30149.00
18231 1.47 24732.55
18232 1.41 22474.66
18233 1.80 22918.40
18234 1.83 27049.44
18235 1.82 33869.12
18236 1.48 34734.97
18237 1.62 2325.30
18238 1.56 2055.35
18239 1.56 2162.67
18240 1.54 1832.24
18241 1.57 1974.26
18242 1.56 1892.05
18243 1.57 1924.28
18244 1.63 2046.96
18245 1.71 1191.70
18246 1.87 1191.92
18247 1.93 1527.63
18248 1.62 2894.77

9123 rows × 2 columns

Finding how many regions are recorded for each type of Avocado as well as how many entries per region

In [17]:
# Group organic rows by region and report how many weekly entries each has.
# (Grouping by the column name is equivalent to passing the Series itself.)
regions_organic = organic_frame.groupby('region')
print("Total regions for Organic avocado:", len(regions_organic))
print("-------------")
for region_name, region_rows in regions_organic:
    print(region_name, " : ", len(region_rows))
Total regions for Organic avocado: 54
-------------
Albany  :  169
Atlanta  :  169
BaltimoreWashington  :  169
Boise  :  169
Boston  :  169
BuffaloRochester  :  169
California  :  169
Charlotte  :  169
Chicago  :  169
CincinnatiDayton  :  169
Columbus  :  169
DallasFtWorth  :  169
Denver  :  169
Detroit  :  169
GrandRapids  :  169
GreatLakes  :  169
HarrisburgScranton  :  169
HartfordSpringfield  :  169
Houston  :  169
Indianapolis  :  169
Jacksonville  :  169
LasVegas  :  169
LosAngeles  :  169
Louisville  :  169
MiamiFtLauderdale  :  169
Midsouth  :  169
Nashville  :  169
NewOrleansMobile  :  169
NewYork  :  169
Northeast  :  169
NorthernNewEngland  :  169
Orlando  :  169
Philadelphia  :  169
PhoenixTucson  :  169
Pittsburgh  :  169
Plains  :  169
Portland  :  169
RaleighGreensboro  :  169
RichmondNorfolk  :  169
Roanoke  :  169
Sacramento  :  169
SanDiego  :  169
SanFrancisco  :  169
Seattle  :  169
SouthCarolina  :  169
SouthCentral  :  169
Southeast  :  169
Spokane  :  169
StLouis  :  169
Syracuse  :  169
Tampa  :  169
TotalUS  :  169
West  :  169
WestTexNewMexico  :  166
In [18]:
# Same per-region entry count, for the conventional subset.
regions_conventional = conventional_frame.groupby('region')
print("Total regions for Conventional avocado:", len(regions_conventional))
print("-------------")
for region_name, region_rows in regions_conventional:
    print(region_name, " : ", len(region_rows))
Total regions for Conventional avocado: 54
-------------
Albany  :  169
Atlanta  :  169
BaltimoreWashington  :  169
Boise  :  169
Boston  :  169
BuffaloRochester  :  169
California  :  169
Charlotte  :  169
Chicago  :  169
CincinnatiDayton  :  169
Columbus  :  169
DallasFtWorth  :  169
Denver  :  169
Detroit  :  169
GrandRapids  :  169
GreatLakes  :  169
HarrisburgScranton  :  169
HartfordSpringfield  :  169
Houston  :  169
Indianapolis  :  169
Jacksonville  :  169
LasVegas  :  169
LosAngeles  :  169
Louisville  :  169
MiamiFtLauderdale  :  169
Midsouth  :  169
Nashville  :  169
NewOrleansMobile  :  169
NewYork  :  169
Northeast  :  169
NorthernNewEngland  :  169
Orlando  :  169
Philadelphia  :  169
PhoenixTucson  :  169
Pittsburgh  :  169
Plains  :  169
Portland  :  169
RaleighGreensboro  :  169
RichmondNorfolk  :  169
Roanoke  :  169
Sacramento  :  169
SanDiego  :  169
SanFrancisco  :  169
Seattle  :  169
SouthCarolina  :  169
SouthCentral  :  169
Southeast  :  169
Spokane  :  169
StLouis  :  169
Syracuse  :  169
Tampa  :  169
TotalUS  :  169
West  :  169
WestTexNewMexico  :  169

Now we can make predictions based on a specific region we choose from either conventional or organic avocados. Let's start with organic and choose the "TotalUS" region.

In [19]:
date_price = regions_organic.get_group("TotalUS")[['Date', 'AveragePrice']].reset_index(drop=True)
In [20]:
# Plot the organic TotalUS average price over time and save the figure.
# Removed the dead commented-out subplots line; saving through the Axes'
# parent Figure is more robust than plt.savefig, which depends on whichever
# figure pyplot currently considers "active".
ax = date_price.plot(x='Date', y='AveragePrice', kind="line", figsize=(15, 10))
ax.figure.savefig('line_organic_avgp.png', bbox_inches='tight')
In [21]:
# Prophet requires the timestamp column to be named `ds` and the target `y`.
date_price = date_price.rename(columns={'Date':'ds', 'AveragePrice':'y'})
In [22]:
# Create & fit a Prophet model on the organic TotalUS series. See the quick
# start guide: https://facebook.github.io/prophet/docs/quick_start.html
# Prophet auto-detects yearly seasonality here and disables weekly/daily
# seasonality (per the INFO log below) since the data is weekly.
# NOTE(review): the `fbprophet` package was renamed to `prophet` in v1.0;
# new environments should install and import `prophet` instead.

m = Prophet()
m.fit(date_price)
INFO:fbprophet.forecaster:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:fbprophet.forecaster:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
C:\Users\Omar\Anaconda3\lib\site-packages\pystan\misc.py:399: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  elif np.issubdtype(np.asarray(v).dtype, float):
Out[22]:
<fbprophet.forecaster.Prophet at 0x2ceacc919e8>
In [23]:
# Build a frame extending 365 periods past the last history date. By default
# it also includes the historical dates, so predict() returns the in-sample
# fit as well.
# NOTE(review): make_future_dataframe defaults to daily frequency even though
# this series is weekly — pass freq='W' if weekly forecast steps are intended.

future = m.make_future_dataframe(periods=365)
In [24]:
# Last dates that will be forecasted (history ends 2018-03; horizon runs to
# late March 2019).
future.tail()
Out[24]:
ds
529 2019-03-21
530 2019-03-22
531 2019-03-23
532 2019-03-24
533 2019-03-25
In [25]:
# predict() assigns each row in `future` a forecast `yhat`, plus component
# columns and the `yhat_lower`/`yhat_upper` uncertainty interval. Historical
# dates get an in-sample fit.

forecast = m.predict(future)
forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail()
Out[25]:
ds yhat yhat_lower yhat_upper
529 2019-03-21 1.777671 1.603879 1.946662
530 2019-03-22 1.785573 1.620681 1.977702
531 2019-03-23 1.793586 1.615191 1.959374
532 2019-03-24 1.801608 1.621226 1.968077
533 2019-03-25 1.809540 1.636876 1.970276
In [26]:
# Plot the forecast (history + 365-day horizon) via Prophet.plot.
# Save through the returned Figure object rather than plt.savefig, which
# silently saves whatever figure happens to be pyplot's current one.
fig1 = m.plot(forecast)
fig1.savefig('forecast_organic.png', bbox_inches='tight')

Here we can see how the individual components of the model affect the predictions

In [27]:
# If you want to see the forecast components, you can use the Prophet.plot_components method. By default you’ll see the trend, 
# yearly seasonality, and weekly seasonality of the time series. If you include holidays, you’ll see those here, too.

fig2 = m.plot_components(forecast)
plt.savefig('forecast_organic_components.png',bbox_inches='tight')

Now let's analyze conventional avocados in the TotalUS region instead.

In [28]:
date_price = regions_conventional.get_group("TotalUS")[['Date', 'AveragePrice']].reset_index(drop=True)
In [29]:
# Plot the conventional TotalUS average price over time and save the figure,
# using the Axes' parent Figure instead of pyplot's implicit current figure.
ax = date_price.plot(x='Date', y='AveragePrice', kind="line", figsize=(15, 10))
ax.figure.savefig('line_conventional_avgp.png', bbox_inches='tight')
In [30]:
date_price = date_price.rename(columns={'Date':'ds', 'AveragePrice':'y'})
In [31]:
# Create & fit a fresh Prophet model on the conventional TotalUS series
# (rebinding `m`, so the organic model above is no longer reachable).
# Quick start: https://facebook.github.io/prophet/docs/quick_start.html

m = Prophet()
m.fit(date_price)
INFO:fbprophet.forecaster:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:fbprophet.forecaster:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
C:\Users\Omar\Anaconda3\lib\site-packages\pystan\misc.py:399: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  elif np.issubdtype(np.asarray(v).dtype, float):
Out[31]:
<fbprophet.forecaster.Prophet at 0x2ceac8b2668>
In [32]:
# Extend 365 daily periods past the last history date; historical dates are
# included so predict() also yields the in-sample fit.

future = m.make_future_dataframe(periods=365)
In [33]:
# Last dates that will be forecasted up to (late March 2019).
future.tail()
Out[33]:
ds
529 2019-03-21
530 2019-03-22
531 2019-03-23
532 2019-03-24
533 2019-03-25
In [34]:
# Forecast the conventional series: `yhat` with `yhat_lower`/`yhat_upper`
# uncertainty bounds per future date.

forecast = m.predict(future)
forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail()
Out[34]:
ds yhat yhat_lower yhat_upper
529 2019-03-21 1.100679 0.971947 1.233577
530 2019-03-22 1.101912 0.981304 1.245725
531 2019-03-23 1.103614 0.980032 1.237514
532 2019-03-24 1.105771 0.976225 1.240768
533 2019-03-25 1.108353 0.973552 1.242185
In [35]:
# Plot the conventional forecast; save via the returned Figure object rather
# than plt.savefig's implicit current-figure lookup.
fig1 = m.plot(forecast)
fig1.savefig('forecast_conventional.png', bbox_inches='tight')
In [36]:
# Component breakdown of the conventional forecast (trend + yearly
# seasonality by default). Saved via the returned Figure for robustness.

fig2 = m.plot_components(forecast)
fig2.savefig('forecast_conventional_components.png', bbox_inches='tight')